from funasr import AutoModel

class CaptionExtract:
    """Transcribe a media file with a FunASR ``AutoModel`` pipeline.

    The pipeline is built lazily on the first :meth:`extract` call and then
    reused, so repeated calls do not reload the models. Every setting that is
    forwarded to ``AutoModel(...)`` and ``model.generate(...)`` can be
    overridden; the defaults reproduce the previously hard-coded values.

    NOTE(review): ``model_dir`` is stored on the instance but is not used to
    select the model; the pipeline defaults to ``"paraformer-zh"``. Confirm
    whether ``model_dir`` was meant to drive the model choice.
    """

    # Keyword arguments forwarded to ``AutoModel`` unless overridden.
    DEFAULT_MODEL_KWARGS = {
        "model": "paraformer-zh",
        "model_revision": "v2.0.4",
        "vad_model": "fsmn-vad",
        "vad_model_revision": "v2.0.4",
        "punc_model": "ct-punc-c",
        "punc_model_revision": "v2.0.4",
        "spk_model": "cam++",
        "spk_model_revision": "v2.0.2",
    }

    # Keyword arguments forwarded to ``model.generate`` unless overridden.
    DEFAULT_GENERATE_KWARGS = {
        "language": "auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
        "use_itn": True,
        "batch_size_s": 60,
        "merge_vad": False,
        "merge_length_s": 15,
        "ban_emo_unk": True,
    }

    def __init__(self, model_dir="iic/SenseVoiceSmall", *, device="cuda:0",
                 model_kwargs=None, generate_kwargs=None, model=None):
        """Store the configuration; no model is loaded here.

        Args:
            model_dir: Kept for backward compatibility; stored as
                ``self.model_dir`` and otherwise unused by this class.
            device: Value passed as ``device`` to ``AutoModel``.
            model_kwargs: Optional mapping merged over
                ``DEFAULT_MODEL_KWARGS`` (and over ``device``).
            generate_kwargs: Optional mapping merged over
                ``DEFAULT_GENERATE_KWARGS``.
            model: Optional pre-built object exposing ``generate(**kwargs)``.
                When given, it is used as-is and ``AutoModel`` is never
                constructed.
        """
        self.model_dir = model_dir
        self.device = device
        # Copy into fresh dicts so the class-level defaults are never mutated.
        self.model_kwargs = {
            **self.DEFAULT_MODEL_KWARGS,
            "device": device,
            **(model_kwargs or {}),
        }
        self.generate_kwargs = {
            **self.DEFAULT_GENERATE_KWARGS,
            **(generate_kwargs or {}),
        }
        self._model = model

    def _get_model(self):
        """Return the cached model, building it with ``AutoModel`` on first use."""
        if self._model is None:
            self._model = AutoModel(**self.model_kwargs)
        return self._model

    def extract(self, file, **generate_overrides):
        """Run the pipeline on ``file`` and return its first result.

        Args:
            file: Passed unchanged as ``input`` to ``model.generate``.
            **generate_overrides: Per-call options that take precedence over
                the instance's ``generate_kwargs``.

        Returns:
            A ``(text, sentence_info)`` tuple taken from the ``"text"`` and
            ``"sentence_info"`` keys of the first element of the result.

        Raises:
            IndexError: If ``model.generate`` returns an empty sequence.
            KeyError: If the first result lacks ``"text"`` or
                ``"sentence_info"``.
        """
        # A fresh ``cache`` dict is supplied on every call unless the caller
        # explicitly provides one.
        params = {"cache": {}, **self.generate_kwargs, **generate_overrides}
        res = self._get_model().generate(input=file, **params)
        first = res[0]
        return first["text"], first["sentence_info"]

if __name__ == '__main__':
    # Demo entry point: run the extractor on the bundled sample file.
    # The path is a plain literal (the original f-string had no placeholders).
    caption_extract = CaptionExtract()
    caption_extract.extract("../files/demo.mp4.mp3")